In [1]:
# Import all libraries needed for the tutorial

# General syntax to import specific functions in a library: 
##from (library) import (specific library function)
from pandas import DataFrame, read_csv

# General syntax to import a library but no functions: 
##import (library) as (give the library a nickname/alias)
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib.pyplot as plt
import seaborn 
from sklearn import preprocessing  # to normalise existing X
from nltk.corpus import stopwords
import folium as folium
from sklearn.preprocessing import label_binarize
from collections import Counter
from sklearn.metrics import classification_report
from itertools import cycle
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
from scipy import interp
import pylab as pl
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
from os import listdir
from os.path import isfile, join
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import GridSearchCV 
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.metrics.pairwise
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import roc_auc_score
from wordcloud import WordCloud
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import warnings
# Enable inline plotting
%matplotlib inline
unable to import 'smart_open.gcs', disabling that module
In [2]:
business_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/business"
entertainment_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/entertainment"
politics_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/politics"
sport_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/sport"
tech_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/tech"
In [3]:
unique_num=0
col_names =  ['Id', 'Title', 'Content','Category']
Data=pd.DataFrame(columns = col_names)
businessfiles = [f for f in listdir(business_path) if isfile(join(business_path, f))]
entertainmentfiles = [f for f in listdir(entertainment_path) if isfile(join(entertainment_path, f))]
politicsfiles = [f for f in listdir(politics_path) if isfile(join(politics_path, f))]
sportfiles = [f for f in listdir(sport_path) if isfile(join(sport_path, f))]
techfiles=[f for f in listdir(tech_path) if isfile(join(tech_path, f))]
In [4]:
Category='business'
for i in businessfiles:
    val=business_path+'/'+i
    with open(val) as f:
        lines = f.read().splitlines()
    title=lines[0]
    Content=""
    for j in range(1,len(lines)):
        Content=Content+" "+lines[j]
    size=Data.shape[0]
    Data.loc[size]=[unique_num,title,Content,Category]
    unique_num=unique_num+1
Category='entertainment'
for i in entertainmentfiles:
    val=entertainment_path+'/'+i
    with open(val) as f:
        lines = f.read().splitlines()
    title=lines[0]
    Content=""
    for j in range(1,len(lines)):
        Content=Content+" "+lines[j]
    size=Data.shape[0]
    Data.loc[size]=[unique_num,title,Content,Category]
    unique_num=unique_num+1
Category='politics'
for i in politicsfiles:
    val=politics_path+'/'+i
    with open(val) as f:
        lines = f.read().splitlines()
    title=lines[0]
    Content=""
    for j in range(1,len(lines)):
        Content=Content+" "+lines[j]
    size=Data.shape[0]
    Data.loc[size]=[unique_num,title,Content,Category]
    unique_num=unique_num+1
Category='sport'
for i in sportfiles:
    val=sport_path+'/'+i
    with open(val,encoding="utf8", errors='ignore') as f:
        lines = f.read().splitlines()
    title=lines[0]
    Content=""
    for j in range(1,len(lines)):
        Content=Content+" "+lines[j]
    size=Data.shape[0]
    Data.loc[size]=[unique_num,title,Content,Category]
    unique_num=unique_num+1
Category='tech'
for i in techfiles:
    val=tech_path+'/'+i
    with open(val) as f:
        lines = f.read().splitlines()
    title=lines[0]
    Content=""
    for j in range(1,len(lines)):
        Content=Content+" "+lines[j]
    size=Data.shape[0]
    Data.loc[size]=[unique_num,title,Content,Category]
    unique_num=unique_num+1
In [5]:
train, test = train_test_split(Data,test_size=0.2,stratify=Data[['Category']])
In [6]:
train.to_csv('train_set.tsv', sep='\t')
y_Correct=test['Category']
test=test.drop('Category',1)
test.to_csv('test_set.tsv', sep='\t')
In [7]:
#1
str1=""
str2=""
str3=""
str4=""
str5=""
w1=Data[Data["Category"]=='business']
w2=Data[Data["Category"]=='entertainment']
w3=Data[Data["Category"]=='politics']
w4=Data[Data["Category"]=='sport']
w5=Data[Data["Category"]=='tech']
for t in w1['Content']:
    str1=str1+" "+ t
for t in w2['Content']:
    str2=str2+" " + t
for t in w3['Content']:
    str3=str3+" " + t
for t in w4['Content']:
    str4=str4+" "+ t
for t in w5['Content']:
    str5=str5+" "+ t
In [8]:
wordcloud1 = WordCloud(width=3000, height=3000,background_color="white").generate(str1) 
plt.figure(figsize=(100,100))
plt.imshow(wordcloud1)
plt.axis("off")
plt.show()
In [9]:
wordcloud2 = WordCloud(width=3000, height=3000,background_color="white").generate(str2) 
plt.figure(figsize=(100,100))
plt.imshow(wordcloud2)
plt.axis("off")
plt.show()
In [10]:
wordcloud3 = WordCloud(width=3000, height=3000,background_color="white").generate(str3) 
plt.figure(figsize=(100,100))
plt.imshow(wordcloud3)
plt.axis("off")
plt.show()
In [11]:
wordcloud4 = WordCloud(width=3000, height=3000,background_color="white").generate(str4) 
plt.figure(figsize=(100,100))
plt.imshow(wordcloud4)
plt.axis("off")
plt.show()
In [12]:
wordcloud5 = WordCloud(width=3000, height=3000,background_color="white").generate(str5) 
plt.figure(figsize=(100,100))
plt.imshow(wordcloud5)
plt.axis("off")
plt.show()
In [13]:
#2
x_train=train['Content']
y_train=train['Category']
x_test=test['Content']
###Input1
count_train = CountVectorizer()
count1= count_train.fit_transform(x_train)
count2=count_train.transform(x_test) 
###Input2
tf_train=TfidfVectorizer()
x = tf_train.fit_transform(x_train)
y=tf_train.transform(x_test)
warnings.filterwarnings('ignore')
In [14]:
####Support Vector Machine -Input1
C=[0.1, 1, 10, 100, 1000]
num1=np.random.randint(0,3)
gamma=[1, 0.1, 0.01, 0.001, 0.0001]
num2=np.random.randint(0,3)
kernel=['rbf','linear']
num3=np.random.randint(0,1)
kernel_input=kernel[num3]
gamma_input=gamma[num2]
C_input=C[num1]
clf = svm.SVC(kernel=kernel_input,gamma=gamma_input,C=C_input,probability=True) # Linear Kernel
clf.fit(count1, y_train)
y_pred1 = clf.predict(count2)
result1=cross_val_score(clf,count1,y_train,cv=10,scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2=cross_val_score(clf,count1,y_train,cv=10,scoring='precision_macro')
result3=cross_val_score(clf,count1,y_train,cv=10,scoring='recall_macro')
result4=cross_val_score(clf,count1,y_train,cv=10,scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
y_score=clf.fit(count1, y_train).predict_proba(count2)
n_classes = 5
actual = y_Correct.to_numpy()
labels=y_Correct.unique()
actual = label_binarize(actual, classes=labels)
fpr = dict()
tpr = dict()
roc_auc = dict()
y_pred1=np.array(y_pred1)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])      
# ROC curve
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
         ''.format(roc_auc["macro"]),
        color='navy', linestyle=':', linewidth=4)

colors = cycle(['green', 'darkorange', 'cornflowerblue','red','blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
            label='ROC curve of class {0} (area = {1:0.2f})'
            ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
Accuracy: 42.98%
recall: 39.86%
Precision: 78.51%
F-Measure:39.65%
	AUC(macro):  0.6293161351852112
Out[14]:
<matplotlib.legend.Legend at 0x7fc9d04cb490>
In [15]:
####Support Vector Machine -Input2
C=[0.1, 1, 10, 100, 1000]
num1=np.random.randint(0,3)
gamma=[1, 0.1, 0.01, 0.001, 0.0001]
num2=np.random.randint(0,3)
kernel=['rbf','linear']
num3=np.random.randint(0,1)
kernel_input=kernel[num3]
gamma_input=gamma[num2]
C_input=C[num1]
clf = svm.SVC(kernel=kernel_input,gamma=gamma_input,C=C_input,probability=True) # Linear Kernel
clf.fit(x, y_train)
y_pred2=clf.predict(y)
result1=cross_val_score(clf,x,y_train,cv=10,scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2=cross_val_score(clf,x,y_train,cv=10,scoring='precision_macro')
result3=cross_val_score(clf,x,y_train,cv=10,scoring='recall_macro')
result4=cross_val_score(clf,x,y_train,cv=10,scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
y_score=clf.fit(x, y_train).predict_proba(y)
n_classes = 5
actual = y_Correct.to_numpy()
labels=y_Correct.unique()
actual = label_binarize(actual, classes=labels)
fpr = dict()
tpr = dict()
roc_auc = dict()
y_pred1=np.array(y_pred1)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])      
# ROC curve
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves
plt.figure()
lw = 2
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
         ''.format(roc_auc["macro"]),
        color='navy', linestyle=':', linewidth=4)

colors = cycle(['green', 'darkorange', 'cornflowerblue','red','blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
            label='ROC curve of class {0} (area = {1:0.2f})'
            ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
Accuracy: 97.81%
recall: 97.80%
Precision: 97.85%
F-Measure:97.81%
	AUC(macro):  0.6034659927415
Out[15]:
<matplotlib.legend.Legend at 0x7fc9cb2e6f50>
In [16]:
###Random Forests -Input1
clf=RandomForestClassifier()
clf.fit(count1,y_train)
y_pred1 = clf.predict(count2)
result1=cross_val_score(clf,count1,y_train,cv=10,scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2=cross_val_score(clf,count1,y_train,cv=10,scoring='precision_macro')
result3=cross_val_score(clf,count1,y_train,cv=10,scoring='recall_macro')
result4=cross_val_score(clf,count1,y_train,cv=10,scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
y_score=clf.fit(count1, y_train).predict_proba(count2)
n_classes = 5
actual = y_Correct.to_numpy()
labels=y_Correct.unique()
actual = label_binarize(actual, classes=labels)
fpr = dict()
tpr = dict()
roc_auc = dict()
y_pred1=np.array(y_pred1)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])      
# ROC curve
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
         ''.format(roc_auc["macro"]),
        color='navy', linestyle=':', linewidth=4)

colors = cycle(['green', 'darkorange', 'cornflowerblue','red','blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
            label='ROC curve of class {0} (area = {1:0.2f})'
            ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
Accuracy: 95.11%
recall: 94.41%
Precision: 95.43%
F-Measure:95.50%
	AUC(macro):  0.602164141947314
Out[16]:
<matplotlib.legend.Legend at 0x7fc9d035ec90>
In [17]:
###Random Forests -Input2
clf=RandomForestClassifier()
clf.fit(x, y_train)
y_pred2=clf.predict(y)
result1=cross_val_score(clf,x,y_train,cv=10,scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2=cross_val_score(clf,x,y_train,cv=10,scoring='precision_macro')
result3=cross_val_score(clf,x,y_train,cv=10,scoring='recall_macro')
result4=cross_val_score(clf,x,y_train,cv=10,scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
y_score=clf.fit(x, y_train).predict_proba(y)
n_classes = 5
actual = y_Correct.to_numpy()
labels=y_Correct.unique()
actual = label_binarize(actual, classes=labels)
fpr = dict()
tpr = dict()
roc_auc = dict()
y_pred1=np.array(y_pred1)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])      
# ROC curve
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
         ''.format(roc_auc["macro"]),
        color='navy', linestyle=':', linewidth=4)

colors = cycle(['green', 'darkorange', 'cornflowerblue','red','blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
            label='ROC curve of class {0} (area = {1:0.2f})'
            ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
Accuracy: 95.51%
recall: 95.18%
Precision: 95.55%
F-Measure:94.98%
	AUC(macro):  0.5934013162969456
Out[17]:
<matplotlib.legend.Legend at 0x7fc9d067ccd0>
In [18]:
##Naive Bayes-Input1
gnb = GaussianNB()
count1=count1.toarray()
gnb.fit(count1,y_train)
count2=count2.toarray()
y_pred1= gnb.predict(count2)
result1=cross_val_score(clf,count1,y_train,cv=10,scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2=cross_val_score(clf,count1,y_train,cv=10,scoring='precision_macro')
result3=cross_val_score(clf,count1,y_train,cv=10,scoring='recall_macro')
result4=cross_val_score(clf,count1,y_train,cv=10,scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
y_score=gnb.fit(count1, y_train).predict_proba(count2)
n_classes = 5
actual = y_Correct.to_numpy()
labels=y_Correct.unique()
actual = label_binarize(actual, classes=labels)
fpr = dict()
tpr = dict()
roc_auc = dict()
y_pred1=np.array(y_pred1)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])      
# ROC curve
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
         ''.format(roc_auc["macro"]),
        color='navy', linestyle=':', linewidth=4)

colors = cycle(['green', 'darkorange', 'cornflowerblue','red','blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
            label='ROC curve of class {0} (area = {1:0.2f})'
            ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
Accuracy: 95.28%
recall: 94.84%
Precision: 95.78%
F-Measure:94.94%
	AUC(macro):  0.5981244907670751
Out[18]:
<matplotlib.legend.Legend at 0x7fc9d0617c50>
In [19]:
##Naive Bayes-Input2
gnb = GaussianNB()
x=x.toarray()
gnb.fit(x, y_train)
y=y.toarray()
y_pred2=gnb.predict(y)
result1=cross_val_score(clf,x,y_train,cv=10,scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2=cross_val_score(clf,x,y_train,cv=10,scoring='precision_macro')
result3=cross_val_score(clf,x,y_train,cv=10,scoring='recall_macro')
result4=cross_val_score(clf,x,y_train,cv=10,scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
y_score=gnb.fit(x, y_train).predict_proba(y)
n_classes = 5
actual = y_Correct.to_numpy()
labels=y_Correct.unique()
actual = label_binarize(actual, classes=labels)
fpr = dict()
tpr = dict()
roc_auc = dict()
y_pred1=np.array(y_pred1)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])      
# ROC curve
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
         ''.format(roc_auc["macro"]),
        color='navy', linestyle=':', linewidth=4)

colors = cycle(['green', 'darkorange', 'cornflowerblue','red','blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
            label='ROC curve of class {0} (area = {1:0.2f})'
            ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
Accuracy: 95.67%
recall: 95.23%
Precision: 94.75%
F-Measure:95.54%
	AUC(macro):  0.595847884812858
Out[19]:
<matplotlib.legend.Legend at 0x7fc9d0604a10>
In [20]:
##K-Nearest Neighbor-Input1

val1=x_test.shape
val=val1[0]
k=3
inputtrain=count_train.transform(np.array(x_train))
inputtest=count_train.transform(np.array(x_test))
def predict(X_train, y_train, X_test, k,g):
    distances = []
    topKcategories = []
    for i in range(g):
        distance = sklearn.metrics.pairwise.cosine_distances(X_test, X_train[i])
        distance = distance[0][0]
        distances.append([distance, i])
    distances = sorted(distances)
    for i in range(k):
        index = distances[i][1]
        topKcategories.append(y_train[index])
    most_common_category = Counter(topKcategories).most_common(1)[0][0]
    return most_common_category
total_Accuracy=0.0
total_precision=0.0
total_recall=0.0
total_f1=0.0
for kf in range(10):
    predictions = []
    for i in range(val):
        g=x_train.shape[0]
        predictions.append(predict(inputtrain, np.array(y_train), inputtest[i], k,g))
    predictions = np.asarray(predictions)
    y_score1=predictions
    Accuracy=accuracy_score(y_Correct,y_score1)
    precision=precision_score(y_Correct,y_score1, average='macro')
    recall=recall_score(y_Correct,y_score1, average='macro')
    f1=f1_score(y_Correct,y_score1, average='macro')
    total_Accuracy+=Accuracy
    total_precision+=precision
    total_recall+=recall
    total_f1+=f1
total_Accuracy/=10
total_precision/=10
total_recall/=10
total_f1/=10
print("Accuracy: %.2f%%" % (total_Accuracy.mean()*100.0))
print("Precision: %.2f%%" % (total_precision.mean()*100.0))
print("Recall: %.2f%%" % (total_recall.mean()*100.0))
print("F-Measure: %.2f%%" % (total_f1.mean()*100.0))
Accuracy: 79.55%
Precision: 80.02%
Recall: 78.73%
F-Measure: 78.87%
In [21]:
##K-Nearest Neighbor-Input2
val1=x_test.shape
val=val1[0]
k=3
x = tf_train.transform(np.array(x_train))
y=tf_train.transform(np.array(x_test))
def predict(X_train, y_train, X_test, k,g):
    distances = []
    topKcategories = []
    for i in range(g):
        distance = sklearn.metrics.pairwise.cosine_distances(X_test, X_train[i])
        distance = distance[0][0]
        distances.append([distance, i])
    distances = sorted(distances)
    for i in range(k):
        index = distances[i][1]
        topKcategories.append(y_train[index])
    most_common_category = Counter(topKcategories).most_common(1)[0][0]
    return most_common_category
total_Accuracy=0.0
total_precision=0.0
total_recall=0.0
total_f1=0.0
for kf in range(10):
    predictions1 = []
    for i in range(val):
        g=x_train.shape[0]
        predictions1.append(predict(x,np.array(y_train), y[i], k,g))
    predictions1 = np.asarray(predictions1)
    y_score2=predictions1
    Accuracy=accuracy_score(y_Correct,y_score2)
    precision=precision_score(y_Correct,y_score2, average='macro')
    recall=recall_score(y_Correct,y_score2, average='macro')
    f1=f1_score(y_Correct,y_score2, average='macro')
    total_Accuracy+=Accuracy
    total_precision+=precision
    total_recall+=recall
    total_f1+=f1
total_Accuracy/=10
total_precision/=10
total_recall/=10
total_f1/=10
print("Accuracy: %.2f%%" % (total_Accuracy.mean()*100.0))
print("Precision: %.2f%%" % (total_precision.mean()*100.0))
print("Recall: %.2f%%" % (total_recall.mean()*100.0))
print("F-Measure: %.2f%%" % (total_f1.mean()*100.0))
Accuracy: 89.66%
Precision: 90.00%
Recall: 89.73%
F-Measure: 89.56%
In [ ]:
 
In [22]:
#4
#input1
count1 = CountVectorizer()
in1 = count1.fit_transform(test['Content'])
#input2
tfidf=TfidfVectorizer()
in2= tfidf.fit_transform(test['Content'])
#input3
model = Word2Vec(test['Content'], min_count=1)
in3 = model[model.wv.vocab]
In [23]:
##input1
In1_Norm = preprocessing.normalize(in1)
km2 = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In1_Norm)
predicted_value1=km2.predict(In1_Norm)
##input2
In2_Norm = preprocessing.normalize(in2)
km2 = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In2_Norm)
predicted_value2=km2.predict(In2_Norm)
##input3
In3_Norm = preprocessing.normalize(in3)
km2 = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In3_Norm)
predicted_value3=km2.predict(In3_Norm)
In [ ]:

In [24]:
#Input1
km = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In1_Norm)
test['Category']=km.labels_
test['Segment']=test['Category'].map({0:'business',1:'entertainment',2:'politics',3:'sport',4:'tech'})
x_axis=test['Content']
y_Axis=test['Category']
seaborn.set(rc={'figure.figsize':(8,5)})
seaborn.scatterplot(x_axis ,y_Axis,hue =test['Segment'],palette=['g','r','c','m','b'])
plt.title('CLusters by PCA Compontents')
plt.savefig('Principal Components Analysis-Input1.png',dpi=1000)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
    339                 pass
    340             else:
--> 341                 return printer(obj)
    342             # Finally look for special method names
    343             method = get_real_method(obj, self.print_method)

~/anaconda3/lib/python3.7/site-packages/IPython/core/pylabtools.py in <lambda>(fig)
    246 
    247     if 'png' in formats:
--> 248         png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))
    249     if 'retina' in formats or 'png2x' in formats:
    250         png_formatter.for_type(Figure, lambda fig: retina_figure(fig, **kwargs))

~/anaconda3/lib/python3.7/site-packages/IPython/core/pylabtools.py in print_figure(fig, fmt, bbox_inches, **kwargs)
    130         FigureCanvasBase(fig)
    131 
--> 132     fig.canvas.print_figure(bytes_io, **kw)
    133     data = bytes_io.getvalue()
    134     if fmt == 'svg':

~/anaconda3/lib/python3.7/site-packages/matplotlib/backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, **kwargs)
   2089                     orientation=orientation,
   2090                     bbox_inches_restore=_bbox_inches_restore,
-> 2091                     **kwargs)
   2092             finally:
   2093                 if bbox_inches and restore_bbox:

~/anaconda3/lib/python3.7/site-packages/matplotlib/backends/backend_agg.py in print_png(self, filename_or_obj, metadata, pil_kwargs, *args, **kwargs)
    525 
    526         else:
--> 527             FigureCanvasAgg.draw(self)
    528             renderer = self.get_renderer()
    529             with cbook._setattr_cm(renderer, dpi=self.figure.dpi), \

~/anaconda3/lib/python3.7/site-packages/matplotlib/backends/backend_agg.py in draw(self)
    384         Draw the figure using the renderer.
    385         """
--> 386         self.renderer = self.get_renderer(cleared=True)
    387         with RendererAgg.lock:
    388             self.figure.draw(self.renderer)

~/anaconda3/lib/python3.7/site-packages/matplotlib/backends/backend_agg.py in get_renderer(self, cleared)
    397                           and getattr(self, "_lastKey", None) == key)
    398         if not reuse_renderer:
--> 399             self.renderer = RendererAgg(w, h, self.figure.dpi)
    400             self._lastKey = key
    401         elif cleared:

~/anaconda3/lib/python3.7/site-packages/matplotlib/backends/backend_agg.py in __init__(self, width, height, dpi)
     84         self.width = width
     85         self.height = height
---> 86         self._renderer = _RendererAgg(int(width), int(height), dpi)
     87         self._filter_renderers = []
     88 

ValueError: Image size of 76198x343 pixels is too large. It must be less than 2^16 in each direction.
<Figure size 576x360 with 1 Axes>
In [ ]:
#Input2
km = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In2_Norm)
test['Category']=km.labels_
test['Segment']=test['Category'].map({0:'business',1:'entertainment',2:'politics',3:'sport',4:'tech'})
x_axis=test['Content']
y_Axis=test['Category']
fig=plt.figure(figsize=(6,3))
fig.set_size_inches(5, 8)
seaborn.scatterplot(x_axis ,y_Axis,hue =test['Segment'],palette=['g','r','c','m','b'])
plt.title('CLusters by PCA Compontents')
plt.savefig('Principal Components Analysis-Input2.png',dpi=1000)
In [27]:
#Input3
km = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In3_Norm)
e=pd.DataFrame(columns=['Category'])
e['Category']=km.labels_
test['Segment']=e['Category'].map({0:'business',1:'entertainment',2:'politics',3:'sport',4:'tech'})
x_axis=test['Content']
y_Axis=e['Category']
seaborn.set(rc={'figure.figsize':(8,5)})
seaborn.scatterplot(x_axis ,y_Axis,hue =test['Segment'],palette=['g','r','c','m','b'])
plt.title('CLusters by PCA Compontents')
plt.savefig('Principal Components Analysis-Input3.png',dpi=1000)
In [ ]:
###
#ΟΝΟΜΑΤΕΠΩΝΥΜΟ:ΠΡΟΚΟΠΙΟΣ ΣΤΑΜΕΛΙΑΣ
#ΑΡΙΘΜΟΣ ΜΗΤΡΩΟΥ:1115201400190
#Δεν έχει υλοποιηθεί το roc plot για το knn classification 
#Έχει ένα error στην εμφάνιση σε 2d η συμπίεση με τις εικόνες στο size δεν κατάφερα να το επιλύσω παρόλο που εμφανίζονται οι εικόνες !
#Τα αποτελέσματα στη μέθοδο συμπίεσης δεν εμφανίζονται στη οθόνη αλλά σε .png εικόνες !Είναι μέσα στο φάκελο της εργασίας που απέστειλα  και οι τρεις εικόνες αντίστοιχα για τις αναπαραστάσεις των κιμένων!
#1-CountVectorizer
#2-Tfidf
#3-Word2vec
#Επίσης στο input2 βγάζει το ίδιο error με το input1 αλλά επειδή το έτρεξα πάλι τελευταία στιγνή για να προλάβω να το στείλω δεν ολοκληρώθηκε !
#
#
##
In [ ]: